#!/usr/bin/env python
# coding: utf-8

# In[1]:


exec(open('init_path.py').read())
exec((P_Lib/'GasStation.py').read_text())
get_ipython().run_line_magic('matplotlib', 'inline')


# ### OD Pairs (within 130km direct distance) to Stata

# In[11]:


df = read_hdf(P_GS_Data / 'GS' / 'gas_station_distance.h5', 'GS')
df.head(2)


# In[12]:


df.shape


# In[13]:


print(df[(df.O==0)&(df.D==1)])
print(df[(df.O==1)&(df.D==0)])


# In[14]:


# restrict to pairs with direct distance less than 130k
df = df[df.Direct<=130_000]
df.shape


# In[46]:


GS = read_hdf(f_GS, 'GS')


# In[17]:


GS = GS[GS.Lat.notnull()]
dict_GS_Lat = GS.set_index('StID').Lat.to_dict()
dict_GS_Long = GS.set_index('StID').Lng.to_dict()


# In[18]:


df['OLat'] = df.O.map(dict_GS_Lat)
df['OLong'] = df.O.map(dict_GS_Long)
df['DLat'] = df.D.map(dict_GS_Lat)
df['DLong'] = df.D.map(dict_GS_Long)


# In[20]:


df.head(2)


# In[21]:


df.shape[0]


# In[23]:


# In[26]:


unit_size = df.shape[0]//10
unit_size


# In[44]:


for i in range(10):
    print(i,i*unit_size,(i+1)*unit_size)
    f = P_GS_Data / 'GS' / ('station_pair_germany_'+str(i)+'.dta')
    df.iloc[i*unit_size:(i+1)*unit_size].to_stata(f, write_index=False)    


# In[ ]:


# OD Pairs are divided into 10 parts: station_pair_germany_0 to 9
# Then run OSRTime stata code for each part
# Then the results are saved into the SAME dta file


# In[ ]:


# RUN OSRTIME ON STATA


# In[2]:


# df = read_stata(P_GS_Data / 'GS' / 'station_pair_germany.dta')
# df.head(2)


# ### Combine all travel times 0-9 to one file

# In[2]:


# df = read_stata(P_GS_Data / 'GS' / 'station_pair_berlin.dta')
# df.head(2)


# In[3]:


ls = []
for f in (P_GS_Data / 'GS').glob('station_pair_germany_*.dta'):
    ls.append(read_stata(f))


# In[6]:


df = concat(ls)


# In[7]:


print(df.shape)
df.head(2)


# In[9]:


df[['O','D','duration']].to_hdf(P_GS_Data / 'GS' / 'travel_time_station_pair_germany.h5', 'GS', mode='w', complevel=9, complib='bzip2')


# ### Long TO Wide Table (RZ - Large RAM)

# In[4]:


# df = read_hdf(P_Research / 'GasStation' / 'travel_time_station_pair_germany.h5', 'GS')
df = read_hdf(P_GS_Data / 'GS' / 'travel_time_station_pair_germany.h5', 'GS')
df.head(2)


# In[5]:


df.shape


# In[6]:


# travel_time_station_pair_germany.h5 only have ODPairs < 130km
# for > 130km, use OLS to fill it
df_all = read_hdf(P_GS_Data / 'GS' / 'gas_station_distance.h5', 'GS')
print(df_all.shape)
df_all.head(2)


# ##### Fill long-gap station-pairs using OLS

# In[7]:


df = merge(df_all, df[['O','D','duration']], how='left')
print(df.head(2))
print(df.shape)


# In[8]:


import statsmodels.api as sm
X = np.array(df.loc[df.duration.notnull(), ['duration']].values)
X = sm.add_constant(X)
y = np.array(df.loc[df.duration.notnull(), ['Direct']].values)


# In[9]:


alpha, beta = np.linalg.lstsq(X, y, rcond=-1)[0].flatten()
alpha, beta


# In[10]:


# X.shape


# In[11]:


# df.loc[df.duration.notnull(), ['Direct']].values.shape


# In[12]:


# df = merge(df_all, df[['O','D','duration']], how='left')
# print(df.head(2))
# print(df.shape)


# In[13]:


df.loc[df.duration.isnull(), 'duration'] = alpha + beta * df.loc[df.duration.isnull(), 'Direct'].values
df.shape


# In[14]:


df.head(2)


# In[17]:


df[['O','D','duration']].to_hdf(P_GS_Data / 'GS' / 'travel_time_station_pair_germany_all_od.h5', 'GS', mode='w', complevel=9, complib='bzip2')


# #### Convert to Wide Table

# In[10]:


# Append Reverse Pair
df = df.append(df.rename(columns={'O':'D','D':'O'}))


# In[11]:


df.shape


# In[12]:


# Append Oneself
arr_stid = df.O.unique()
df_self = DataFrame(np.vstack([arr_stid,arr_stid]).T, columns=['O','D'])
df_self['duration'] = 0
df = df.append(df_self)
df.shape


# In[15]:


df.head(2)


# In[16]:


 version = 'all'


# In[ ]:






# In[20]:


df[(df.O==275)&(df.D==195)]


# In[21]:


df[(df.D==275)&(df.O==902)]


# In[22]:


# df = df.pivot(index='O', columns='D', values='duration')
# df.head(2)


# In[23]:


df = df.pivot(index='O', columns='D', values='duration')
df.head(2)


# In[24]:


# confirm no null value anywhere
df.isnull().any(axis=1).sum()


# In[25]:


# confirm index & columns are ordered
l1 = df.index.tolist()
l2 = df.columns.tolist()
for v1, v2 in zip(l1,l2):
    if v1!=v2:
        print(v1, v2)


# In[26]:


# convert to minutes
df = df/60
df.head(2)


# In[ ]:





# In[ ]:





# In[27]:


# OUTPUT: large wide table N x N in CSV
# Matlab is bad with reading large matrices: divide into 10 files
# n_cols_each_file = 2000
# for i in range(df.shape[0]//n_cols_each_file+1):
#     print(i)
#     df.iloc[:,i*n_cols_each_file:(i+1)*n_cols_each_file].to_csv(P_GS_Data / 'GS'  / 'd'+str(i)+'.csv', index=False, header=False)
n = 10
ls = np.linspace(0, df.shape[0], num=n, dtype='int')
for i in range(n-1):
    print(i, ls[i], ls[i+1])
    df.iloc[:,ls[i]:ls[i+1]].to_csv(P_GS_Data / 'GS' / ('gstime_'+version) / ('d'+str(i)+'.csv'), index=False, header=False)


# In[28]:


# Store index (StID) to another file
Series(df.index).to_csv(P_GS_Data / 'GS' / ('gstime_'+version) /'distance_matrix_germany_stid.csv', index=False, header=False)


# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:




